library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Load the exercise/BMI dataset from the Data directory
exercise_data <- read_csv(file = "Data/visualize_data.csv")
## New names:
## Rows: 142 Columns: 4
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," dbl
## (4): ...1, ...2, Exercise, BMI
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
## • `...1` -> `...2`
# Inspect column names, types, and the first few values
exercise_data %>% glimpse()
## Rows: 142
## Columns: 4
## $ ...1 <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18…
## $ ...2 <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18…
## $ Exercise <dbl> 55.3846, 51.5385, 46.1538, 42.8205, 40.7692, 38.7179, 35.6410…
## $ BMI <dbl> 1.8320590, 1.7892194, 1.7321050, 1.6178724, 1.5036362, 1.3751…
I expect that people who record more exercise will have lower BMIs.
# Pearson correlation between recorded exercise and BMI
with(exercise_data, cor(Exercise, BMI))
## [1] -0.06447185
The correlation coefficient is negative but very close to zero (-0.064), which suggests at most a very weak association between recorded exercise time and BMI — too weak to support a substantive claim without looking at the plot.
# Scatterplot of BMI against recorded exercise time
ggplot(exercise_data, aes(x = Exercise, y = BMI)) +
  geom_point()
## Question 2
library(causact)
# Inspect the structure of the corruption dataset
corruptDF %>% glimpse()
## Rows: 174
## Columns: 7
## $ country <chr> "Afghanistan", "Albania", "Algeria", "Angola", "Argentina"…
## $ region <chr> "Asia Pacific", "East EU Cemt Asia", "MENA", "SSA", "Ameri…
## $ countryCode <chr> "AFG", "ALB", "DZA", "AGO", "ARG", "ARM", "AUS", "AUT", "A…
## $ regionCode <chr> "AP", "ECA", "MENA", "SSA", "AME", "ECA", "AP", "WE/EU", "…
## $ population <int> 35530081, 2873457, 41318142, 29784193, 44271041, 2930450, …
## $ CPI2017 <int> 15, 38, 33, 19, 39, 35, 77, 75, 31, 65, 36, 28, 68, 44, 75…
## $ HDI2017 <dbl> 0.498, 0.785, 0.754, 0.581, 0.825, 0.755, 0.939, 0.908, 0.…
# Open the documentation page for the corruptDF dataset
help("corruptDF")
CPI2017 is a country’s (or territory’s) score on the Corruption Perceptions Index in 2017, which measures the perception of the level of corruption in the public sector. Scores range from 0 to 100, with 0 meaning that a given country is perceived as being very corrupt, and a score of 100 meaning that the country is not perceived as corrupt in the public sector.
HDI2017 is a country’s or territory’s Human Development Index score from 2017, which represents human development based on life expectancy, education levels, and income.
# Scatterplot: human development vs. perceived corruption, 2017
ggplot(corruptDF, aes(x = HDI2017, y = CPI2017)) +
  geom_point()
There seems to be a positive association between HDI and CPI scores in 2017. This relationship doesn’t look perfectly linear and may be best represented with a quadratic function.
# Same scatterplot with a flexible GAM smoother
ggplot(corruptDF, aes(x = HDI2017, y = CPI2017)) +
  geom_point() +
  geom_smooth(method = "gam")
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'
# Same scatterplot with a straight-line (linear model) smoother
ggplot(corruptDF, aes(x = HDI2017, y = CPI2017)) +
  geom_point() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
The gam method fits a generalized additive model, which
is more flexible and fits this data much better than the lm
method, which fits a simple linear model and thus renders a straight
line.
# Color points and smoothers by world region (one curve per region)
ggplot(corruptDF, aes(x = HDI2017, y = CPI2017,
                      color = region, fill = region)) +
  geom_point() +
  geom_smooth(method = "gam")
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'
Haha, this is way too cluttered. Let’s try again…
# One small panel per region instead of overlapping curves
ggplot(corruptDF, aes(x = HDI2017, y = CPI2017)) +
  geom_point() +
  geom_smooth(method = "gam") +
  facet_wrap(vars(region))
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'
This is much more digestible.
# Flip the x-axis so HDI runs from high to low
ggplot(corruptDF, aes(x = HDI2017, y = CPI2017)) +
  geom_point() +
  geom_smooth(method = "gam") +
  scale_x_reverse()
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'
Done, though I don’t particularly like it.
# Polished version of the HDI/CPI plot with full labels, stored for saving
final_plot <- ggplot(corruptDF, aes(x = HDI2017, y = CPI2017)) +
  geom_point() +
  geom_smooth(method = "gam") +
  labs(
    x = "Human Development Index Score",
    y = "Corruption Perceptions Index Score",
    title = "Human Development and Corruption Perception in 2017",
    subtitle = "Data points are countries",
    caption = "Source: causact package in R"
  )
#setwd("/Users/ajg109//Documents/Github/sociol_722/HW3")
#ggsave("hw3_plot.pdf", plot=final_plot)
I commented this out because it was preventing knitting but I promise it worked and I have since used ggsave to successfully save a graph I made for work!
library(tidyverse)
# Load the TV ratings dataset from the Data directory
tv_ratings <- read_csv(file = "Data/tv_ratings.csv")
## Rows: 2266 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): titleId, title, genres
## dbl (3): seasonNumber, av_rating, share
## date (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect column names, types, and the first few values
tv_ratings %>% glimpse()
## Rows: 2,266
## Columns: 7
## $ titleId <chr> "tt2879552", "tt3148266", "tt3148266", "tt3148266", "tt31…
## $ seasonNumber <dbl> 1, 1, 2, 3, 4, 1, 2, 1, 2, 3, 4, 5, 6, 7, 8, 1, 1, 1, 1, …
## $ title <chr> "11.22.63", "12 Monkeys", "12 Monkeys", "12 Monkeys", "12…
## $ date <date> 2016-03-10, 2015-02-27, 2016-05-30, 2017-05-19, 2018-06-…
## $ av_rating <dbl> 8.4890, 8.3407, 8.8196, 9.0369, 9.1363, 8.4370, 7.5089, 8…
## $ share <dbl> 0.51, 0.46, 0.25, 0.19, 0.38, 2.38, 2.19, 6.67, 7.13, 5.8…
## $ genres <chr> "Drama,Mystery,Sci-Fi", "Adventure,Drama,Mystery", "Adven…
# Count seasons per show, attach the count to every row, then keep
# only shows that ran for at least five seasons.
# count(title) is equivalent to group_by(title) %>% summarise(n()).
tv_long <- tv_ratings %>%
  count(title, name = "num_seasons") %>%
  left_join(tv_ratings, by = "title") %>%
  filter(num_seasons >= 5)
# One rating trajectory per show across seasons
ggplot(tv_long, aes(x = seasonNumber, y = av_rating, group = title)) +
  geom_line() +
  labs(x = "Season Number", y = "Average Rating") +
  theme_minimal()
This graph is extremely messy and it is difficult to draw any conclusions.
# Same trajectories, split into one panel per genre combination
ggplot(tv_long, aes(x = seasonNumber, y = av_rating, group = title)) +
  geom_line() +
  facet_wrap(vars(genres)) +
  labs(x = "Season Number", y = "Average Rating") +
  theme_minimal()
The following genre combinations seem to last especially long: “Drama,Romance” and “Crime,Drama,Mystery”. Ratings do seem to change across seasons.
# Which show(s) carry the Drama/Family/Fantasy genre label?
tv_long %>%
  filter(genres == "Drama,Family,Fantasy") %>%
  dplyr::select(title)
## # A tibble: 7 × 1
## title
## <chr>
## 1 Are You Afraid of the Dark?
## 2 Are You Afraid of the Dark?
## 3 Are You Afraid of the Dark?
## 4 Are You Afraid of the Dark?
## 5 Are You Afraid of the Dark?
## 6 Are You Afraid of the Dark?
## 7 Are You Afraid of the Dark?
I assumed that the show that plummeted would be GOT, but it is actually “Are You Afraid of the Dark?”.
# Keep only show-seasons with an average rating of 9 or higher
tv_best <- filter(tv_ratings, av_rating >= 9)

# Bar chart of top-rated seasons by genre (x-axis labels overlap here)
ggplot(tv_best, aes(x = genres)) +
  geom_bar() +
  theme_minimal()
First try, impossible to read the x-axis.
# Horizontal version of the genre bar chart so labels are readable.
# BUG FIX: the original dropped the `+` before labs(), so labs() was
# evaluated as a free-standing expression (printing a labels object)
# instead of being added to the plot — see the printed $title output.
# Also fixes the "Avereage" typo in the title.
tv_best %>%
  ggplot(aes(x = genres)) +
  geom_bar() +
  coord_flip() +
  theme_minimal() +
  labs(title = "Number of Show Seasons with an Average Rating of 9/10 or Greater by Genre")
## $title
## [1] "Number of Show Seasons with an Avereage Rating of 9/10 or Greater by Genre"
##
## attr(,"class")
## [1] "labels"
Wow, much better. coord_flip flips the coordinates so
that the original x-axis is shown on the y-axis and vice versa. Drama is
the genre with the most top-rated shows.
# Flag comedies (genre string contains "Comedy"), keep comedies plus
# pure dramas, then collapse the genre label to just "Comedy"/"Drama".
comedies_dramas <- tv_ratings %>%
  mutate(is_comedy = if_else(str_detect(genres, "Comedy"), 1, 0)) %>%
  filter(is_comedy == 1 | genres == "Drama") %>%
  mutate(genres = if_else(genres == "Drama", "Drama", "Comedy"))

# Check the result
comedies_dramas %>% glimpse()
## Rows: 684
## Columns: 8
## $ titleId <chr> "tt0312081", "tt0312081", "tt0312081", "tt1225901", "tt12…
## $ seasonNumber <dbl> 1, 2, 3, 1, 2, 3, 4, 5, 1, 2, 1, 25, 1, 1, 2, 3, 4, 5, 1,…
## $ title <chr> "8 Simple Rules", "8 Simple Rules", "8 Simple Rules", "90…
## $ date <date> 2002-09-17, 2003-11-04, 2004-11-12, 2009-01-03, 2009-11-…
## $ av_rating <dbl> 7.5000, 8.6000, 8.4043, 7.1735, 7.4686, 7.6858, 6.8344, 7…
## $ share <dbl> 0.03, 0.10, 0.06, 0.40, 0.14, 0.10, 0.04, 0.01, 0.48, 0.4…
## $ genres <chr> "Comedy", "Comedy", "Comedy", "Comedy", "Comedy", "Comedy…
## $ is_comedy <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, …
# Overlaid density curves of season ratings for the two genres
ggplot(comedies_dramas, aes(x = av_rating, fill = genres, color = genres)) +
  geom_density(alpha = .3) +
  labs(x = "Average Rating")
No, sorry Nico, dramas have a slightly higher peak (most common rating), and also have more extremely highly rated tv seasons than do comedies.
# Overlaid histograms: shows raw counts per genre
ggplot(comedies_dramas, aes(x = av_rating, fill = genres, color = genres)) +
  geom_histogram(alpha = .3) +
  labs(x = "Average Rating")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The histogram is helpful because it shows raw counts and makes it clear that we have many more comedies in our dataset than dramas. However, it is difficult to compare the relative frequencies of different ratings.
# Frequency polygons: histogram counts drawn as lines
ggplot(comedies_dramas, aes(x = av_rating, fill = genres, color = genres)) +
  geom_freqpoly(alpha = .7) +
  labs(x = "Average Rating")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
This plot seems to show the same information as the histogram except using lines in place of bars. I think that for our purposes, the density plot is most informative because it accounts for the differences in the number of observations for each genre.
# 2D binned counts of rating vs. viewer share (avoids overplotting)
ggplot(comedies_dramas, aes(x = av_rating, y = share)) +
  geom_bin2d() +
  labs(x = "Average Rating")
This gives you information about the relationship between two different variables and about the distribution of tv shows along these dimensions. It also addresses issues related to overplotting in a scatterplot, as the fill gives information about the count of shows within each 2d bin. I’m curious about the show with such a high viewer share.
# Same 2D bins, filled by genre to compare the two distributions
ggplot(comedies_dramas, aes(x = av_rating, y = share, fill = genres)) +
  geom_bin2d() +
  labs(x = "Average Rating")
Comedy shows with lower ratings tend to have higher viewership shares than do dramas with higher ratings, which is interesting (besides the one outlier).
# Isolate the single season with the outlying viewer share (> 20)
mystery_show <- filter(comedies_dramas, share > 20)
The show is “Dekalog.” Never heard of it…
# Game of Thrones rating trajectory by season
got <- filter(tv_ratings, title == "Game of Thrones")

ggplot(got, aes(x = seasonNumber, y = av_rating)) +
  geom_line(color = "red") +
  ylim(8, 10) +
  labs(
    x = "Season",
    y = "Average Rating",
    title = "Game of Thrones Average Rating by Season"
  )
library(tidyverse)
# Load the women's NCAA tournament dataset from the Data directory
wncaa <- read_csv(file = "Data/wncaa.csv")
## Rows: 2092 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): school, conference, conf_place, how_qual, x1st_game_at_home, tourn...
## dbl (13): year, seed, conf_w, conf_l, conf_percent, reg_w, reg_l, reg_percen...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect column names, types, and the first few values
wncaa %>% glimpse()
## Rows: 2,092
## Columns: 19
## $ year <dbl> 1982, 1982, 1982, 1982, 1982, 1982, 1982, 1982, 1982…
## $ school <chr> "Arizona St.", "Auburn", "Cheyney", "Clemson", "Drak…
## $ seed <dbl> 4, 7, 2, 5, 4, 6, 5, 8, 7, 7, 4, 8, 2, 1, 1, 2, 3, 6…
## $ conference <chr> "Western Collegiate", "Southeastern", "Independent",…
## $ conf_w <dbl> NA, NA, NA, 6, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ conf_l <dbl> NA, NA, NA, 3, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ conf_percent <dbl> NA, NA, NA, 66.7, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ conf_place <chr> "-", "-", "-", "4th", "-", "-", "-", "-", "-", "-", …
## $ reg_w <dbl> 23, 24, 24, 20, 26, 19, 21, 14, 21, 28, 24, 17, 22, …
## $ reg_l <dbl> 6, 4, 2, 11, 6, 7, 8, 10, 8, 7, 5, 13, 7, 5, 1, 6, 4…
## $ reg_percent <dbl> 79.3, 85.7, 92.3, 64.5, 81.3, 73.1, 72.4, 58.3, 72.4…
## $ how_qual <chr> "at-large", "at-large", "at-large", "at-large", "aut…
## $ x1st_game_at_home <chr> "Y", "N", "Y", "N", "Y", "N", "N", "N", "N", "N", "Y…
## $ tourney_w <dbl> 1, 0, 4, 0, 2, 0, 0, 0, 0, 0, 2, 0, 2, 1, 5, 3, 1, 1…
## $ tourney_l <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1…
## $ tourney_finish <chr> "RSF", "1st", "N2nd", "1st", "RF", "1st", "1st", "1s…
## $ full_w <dbl> 24, 24, 28, 20, 28, 19, 21, 14, 21, 28, 26, 17, 24, …
## $ full_l <dbl> 7, 5, 3, 12, 7, 8, 9, 11, 9, 8, 6, 14, 8, 6, 1, 7, 5…
## $ full_percent <dbl> 77.4, 82.8, 90.3, 62.5, 80.0, 70.4, 70.0, 56.0, 70.0…
# Championship counts per school plus each school's share of all titles.
# count(school) is equivalent to group_by(school) %>% summarize(n()).
tourney_wins <- wncaa %>%
  filter(tourney_finish == "Champ") %>%
  count(school, name = "N") %>%
  mutate(
    freq = N / sum(N),
    pct = round(freq * 100, 0)
  )
# Horizontal bars of championship share, ordered by percentage
ggplot(tourney_wins, aes(x = reorder(school, pct), y = pct)) +
  geom_col(position = "dodge") +
  coord_flip() +
  labs(
    x = NULL, y = "Percent",
    title = "Share of WNCAA Tournament Wins by School"
  ) +
  theme_minimal()
Together, UConn and Tennessee have won the majority of WNCAA
tournaments. The rest of these teams have won a significantly lower
percentage of tournaments.
# Restrict the full dataset to schools that have won a championship
champ_names <- unique(tourney_wins$school)
winners <- filter(wncaa, school %in% champ_names)
# Seed distributions by school, ordered by mean seed (NAs dropped)
ggplot(winners, aes(x = reorder(school, seed, na.rm = TRUE), y = seed)) +
  geom_boxplot() +
  coord_flip() +
  labs(x = NULL) +
  theme_minimal()
Honestly, don’t know what seeds are. Based on my brief read on the subject we would expect UConn and Tennessee to have low seeds. Interesting that Notre Dame has the second highest mean and median given they are in the second tier of schools based on the graph from Q1.
# Same comparison as violins to show the full seed distribution
ggplot(winners, aes(x = reorder(school, seed, na.rm = TRUE), y = seed)) +
  geom_violin() +
  coord_flip() +
  labs(x = NULL) +
  theme_minimal()
I personally find the boxplot more informative and digestible but I appreciate that the violin plot also shows the full distribution of data.
# Same comparison as raw points (overplotted, since seeds repeat)
ggplot(winners, aes(x = reorder(school, seed, na.rm = TRUE), y = seed)) +
  geom_point() +
  coord_flip() +
  labs(x = NULL) +
  theme_minimal()
This is not helpful because seed only takes integer values and it’s impossible to tell how many observations occurred at each seed value; this only shows whether there was at least one seed at a given value.
# Per-school means and standard deviations of every numeric column.
# FIX: summarize_if() + funs() are deprecated (the original emitted a
# lifecycle warning); across() with a named list of lambdas is the
# current idiom and yields the same "<col>_mean" / "<col>_sd" names.
winners_sts <- winners %>%
  group_by(school) %>%
  summarize(across(
    where(is.numeric),
    list(
      mean = ~ mean(.x, na.rm = TRUE),
      sd = ~ sd(.x, na.rm = TRUE)
    )
  )) %>%
  ungroup()
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
# Dot plot of average regular-season win percentage, ordered by mean
ggplot(winners_sts, aes(x = reorder(school, reg_percent_mean),
                        y = reg_percent_mean)) +
  geom_point(size = 3) +
  coord_flip() +
  theme_minimal() +
  labs(x = NULL, y = "average win percentage")
Interesting that while UConn remains in the top two in terms of average win percentage, Tennessee is not in the top two based on this statistic.
# Means with ±1 SD ranges via geom_pointrange()
ggplot(winners_sts, aes(x = reorder(school, reg_percent_mean),
                        y = reg_percent_mean)) +
  geom_pointrange(aes(
    ymin = reg_percent_mean - reg_percent_sd,
    ymax = reg_percent_mean + reg_percent_sd
  )) +
  coord_flip() +
  theme_minimal() +
  labs(x = NULL, y = "average win percentage and standard deviation")
Texas A&M has the most narrow interval.
# Build the base dot plot, then add ±1 SD ranges with geom_linerange()
p <- ggplot(winners_sts, aes(x = reorder(school, reg_percent_mean),
                             y = reg_percent_mean)) +
  geom_point(size = 3)

p +
  geom_linerange(aes(
    ymin = reg_percent_mean - reg_percent_sd,
    ymax = reg_percent_mean + reg_percent_sd
  )) +
  coord_flip() +
  theme_minimal() +
  labs(x = NULL, y = "average win percentage and standard deviation")
# Regular-season vs. full-season win percentage with a 45-degree
# reference line (points below it lost ground in the tournament)
ggplot(winners, aes(x = reg_percent, y = full_percent)) +
  geom_point(alpha = .3) +
  geom_abline() +
  xlim(0, 100) +
  ylim(0, 100) +
  theme_minimal() +
  labs(
    x = "WNCAA Regular Season % Wins",
    y = "WNCAA Full Season % Wins"
  )
Most dots are below the line as expected. Dots above the line occur mostly at the upper bounds of reg_percent, so it seems that teams that overperform at tournaments also tend to do quite well during the regular season.
# Flag champions and store the flag as a factor so ggplot uses a
# discrete color scale.
winners <- winners %>%
  mutate(is_champ = factor(if_else(tourney_finish == "Champ", 1, 0)))
# Same scatterplot, colored by championship status (discrete scale)
ggplot(winners, aes(x = reg_percent, y = full_percent, color = is_champ)) +
  geom_point(alpha = .3) +
  geom_abline() +
  xlim(0, 100) +
  ylim(0, 100) +
  theme_minimal() +
  labs(
    x = "WNCAA Regular Season % Wins",
    y = "WNCAA Full Season % Wins"
  )
Wow, almost all of the dots above the line represent teams that were champions in their tournament that year. This makes sense because they won the tournament which is why their full season percentage of wins is higher than the regular season percentage.
# Recreate is_champ as a plain numeric 0/1 to see how ggplot handles it
winners$is_champ <- if_else(winners$tourney_finish == "Champ", 1, 0)
# Same scatterplot with the numeric flag (continuous color scale)
ggplot(winners, aes(x = reg_percent, y = full_percent, color = is_champ)) +
  geom_point(alpha = .3) +
  geom_abline() +
  xlim(0, 100) +
  ylim(0, 100) +
  theme_minimal() +
  labs(
    x = "WNCAA Regular Season % Wins",
    y = "WNCAA Full Season % Wins"
  )
Without knowing that is_champ is a factor variable, R assumes that it is continuous and colors points with a continuous color scale. You can still see the same patterns but the legend would be confusing for someone else to see because it implies that is_champ can take on values other than 0 or 1, which is not the case.
# Build a "school-year" label and the full-minus-regular win-percentage
# gap in a single mutate (the two columns are independent).
winners <- winners %>%
  mutate(
    plot_label = paste(school, year, sep = "-"),
    difference = full_percent - reg_percent
  )

library(ggrepel)

# Rows to annotate: weak regular seasons (< 50%), or champions whose
# regular-season percentage was below 75%.
winners <- winners %>%
  mutate(plot = reg_percent < 50 | (reg_percent < 75 & is_champ == 1))
# Base scatterplot, then label only the flagged points with ggrepel
p1 <- ggplot(winners, aes(x = reg_percent, y = full_percent))

p1 +
  geom_point() +
  geom_text_repel(
    data = subset(winners, plot),
    mapping = aes(label = plot_label),
    nudge_x = -.2
  ) +
  geom_abline() +
  theme_minimal() +
  labs(
    x = "WNCAA Regular Season % Wins",
    y = "WNCAA Full Season % Wins"
  )
I have labeled the points of interest, but just those two.
# Perfect seasons: 100% wins in both the regular and the full season.
# Multiple filter() conditions are combined with AND, same as `&`.
winners %>%
  filter(reg_percent == 100, full_percent == 100) %>%
  dplyr::select(plot_label)
## # A tibble: 8 × 1
## plot_label
## <chr>
## 1 Texas-1986
## 2 UConn-1995
## 3 UConn-2002
## 4 UConn-2009
## 5 UConn-2010
## 6 Baylor-2012
## 7 UConn-2014
## 8 UConn-2016
Most of the observations in which teams have won 100% of regular and full season games are UConn, which isn’t surprising because previous questions have shown that they are an extremely strong team. Texas is the most surprising (and to a lesser extent Baylor) because they have a much lower average win percentage and have won a much lower share of tournaments than UConn.